knitr::opts_chunk$set(fig.width=22, fig.height=20)
library(tidyverse)
library(caret)
library(mgcv)
library(visreg)
Importing Cleaned Data
ames = read_csv('./data/train_cleanest.csv')
test = read_csv('./data/test_cleanest.csv')
ames = ames[,order(colnames(ames))]
test= test[,order(colnames(test))]
ames = ames %>% dplyr::select(-X1)
test = test %>% dplyr::select(-X1)
ames = ames[,order(colnames(ames))] %>%
rename('FirstFlrSF' = "1stFlrSF", 'SecFlrSF' = '2ndFlrSF', 'ThreeSeaPorch' = '3SsnPorch')
test = test[,order(colnames(test))] %>%
rename('FirstFlrSF' = "1stFlrSF", 'SecFlrSF' = '2ndFlrSF', 'ThreeSeaPorch' = '3SsnPorch')
ames$PriceRange <- factor(ames$PriceRange, levels = c("High", "Middle", "Low"))
test$PriceRange <- factor(test$PriceRange, levels = c("High", "Middle", "Low"))
ames$QualCond = ames$OverallCond * ames$OverallQual
test$QualCond = test$OverallCond * test$OverallQual
set.seed(3)
train.idx = sample(1:nrow(ames), 8*nrow(ames)/10)
ames_train = ames[train.idx,]
ames_test = ames[-train.idx,]
## Base GAM Model
ames.gam.base <- mgcv::gam(logPrice
~ s(TotalSF, by=PriceRange)
+ s(QualCond, by= PriceRange)
+ s(Age)
+ Fireplaces:PriceRange
+ s(MSSubClass)
+ Neighborhood:PriceRange
+ s(GarageArea, by=GarageCars),
method='GCV.Cp',
data=ames)
## Base LM Model
ames.lm <- lm(logPrice
~ TotalSF:PriceRange
+ QualCond
+ Age
+ Fireplaces:PriceRange
+ MSSubClass
+ Neighborhood:PriceRange
+ GarageArea:GarageCars,
data=ames)
true = ames_test[,'SalePrice'][[1]]
# gam.predictions = predict.gam(ames.gam, newdata = ames_test, type = 'response')
# gam.errors = gam.predictions - log(true)
# gam.diff =exp(gam.predictions) - true
gam.base.predictions = predict.gam(ames.gam.base, newdata = ames_test, type = 'response') #log preds
gam.base.errors = gam.base.predictions - log(true) # residuals
gam.base.diff =exp(gam.base.predictions) - true # $ diff
lm.predictions = predict(ames.lm, newdata = ames_test)
prediction from a rank-deficient fit may be misleading
lm.errors = lm.predictions - log(true)
lm.diff = exp(lm.predictions) - true
print(paste('GAM RMSE:',sqrt(mean(gam.base.errors^2))))
[1] "GAM RMSE: 0.121294811423439"
print(paste('LM RMSE:',sqrt(mean(lm.errors^2))))
[1] "LM RMSE: 0.130157662484692"
print(paste('GAM: Error in $ as Predicted:',round(mean(abs(gam.base.diff)),2)))
[1] "GAM: Error in $ as Predicted: 14891.7"
print(paste('LM: Error in $ as Predicted:',round(mean(abs(lm.diff)),2)))
[1] "LM: Error in $ as Predicted: 15965.89"
summary(ames.gam.base)
Family: gaussian
Link function: identity
Formula:
logPrice ~ s(TotalSF, by = PriceRange) + s(QualCond, by = PriceRange) +
s(Age) + Fireplaces:PriceRange + s(MSSubClass) + Neighborhood:PriceRange +
s(GarageArea, by = GarageCars)
Parametric coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.641766 0.016218 717.809 < 2e-16 ***
Fireplaces:PriceRangeHigh 0.055237 0.012139 4.550 5.83e-06 ***
Fireplaces:PriceRangeMiddle 0.042844 0.008901 4.813 1.65e-06 ***
Fireplaces:PriceRangeLow 0.073847 0.015465 4.775 1.99e-06 ***
PriceRangeHigh:NeighborhoodBlmngtn 0.344139 0.074638 4.611 4.39e-06 ***
PriceRangeMiddle:NeighborhoodBlmngtn 0.240303 0.039344 6.108 1.31e-09 ***
PriceRangeLow:NeighborhoodBlmngtn 0.000000 0.000000 NA NA
PriceRangeHigh:NeighborhoodBlueste 0.000000 0.000000 NA NA
PriceRangeMiddle:NeighborhoodBlueste 0.201568 0.090975 2.216 0.026880 *
PriceRangeLow:NeighborhoodBlueste 0.000000 0.000000 NA NA
PriceRangeHigh:NeighborhoodBrDale 0.000000 0.000000 NA NA
PriceRangeMiddle:NeighborhoodBrDale 0.000000 0.000000 NA NA
PriceRangeLow:NeighborhoodBrDale 0.138927 0.042398 3.277 0.001076 **
PriceRangeHigh:NeighborhoodBrkSide 0.000000 0.000000 NA NA
PriceRangeMiddle:NeighborhoodBrkSide 0.237728 0.031846 7.465 1.48e-13 ***
PriceRangeLow:NeighborhoodBrkSide 0.072044 0.027134 2.655 0.008019 **
PriceRangeHigh:NeighborhoodClearCr 0.337947 0.031723 10.653 < 2e-16 ***
PriceRangeMiddle:NeighborhoodClearCr 0.252277 0.044608 5.655 1.89e-08 ***
PriceRangeLow:NeighborhoodClearCr 0.000000 0.000000 NA NA
PriceRangeHigh:NeighborhoodCollgCr 0.307020 0.019342 15.873 < 2e-16 ***
PriceRangeMiddle:NeighborhoodCollgCr 0.288363 0.028856 9.993 < 2e-16 ***
PriceRangeLow:NeighborhoodCollgCr 0.210468 0.028391 7.413 2.15e-13 ***
PriceRangeHigh:NeighborhoodCrawfor 0.269848 0.026944 10.015 < 2e-16 ***
PriceRangeMiddle:NeighborhoodCrawfor 0.224607 0.037474 5.994 2.62e-09 ***
PriceRangeLow:NeighborhoodCrawfor 0.000000 0.000000 NA NA
PriceRangeHigh:NeighborhoodEdwards 0.412058 0.124327 3.314 0.000943 ***
PriceRangeMiddle:NeighborhoodEdwards 0.179379 0.026754 6.705 2.94e-11 ***
PriceRangeLow:NeighborhoodEdwards 0.137103 0.021285 6.441 1.64e-10 ***
PriceRangeHigh:NeighborhoodGilbert 0.290500 0.025019 11.611 < 2e-16 ***
PriceRangeMiddle:NeighborhoodGilbert 0.245987 0.025719 9.565 < 2e-16 ***
PriceRangeLow:NeighborhoodGilbert 0.000000 0.000000 NA NA
PriceRangeHigh:NeighborhoodIDOTRR 0.000000 0.000000 NA NA
PriceRangeMiddle:NeighborhoodIDOTRR 0.268432 0.089837 2.988 0.002858 **
PriceRangeLow:NeighborhoodIDOTRR -0.031807 0.027065 -1.175 0.240102
PriceRangeHigh:NeighborhoodMeadowV 0.000000 0.000000 NA NA
PriceRangeMiddle:NeighborhoodMeadowV 0.142333 0.126316 1.127 0.260025
PriceRangeLow:NeighborhoodMeadowV 0.093731 0.041757 2.245 0.024947 *
PriceRangeHigh:NeighborhoodMitchel 0.228196 0.049169 4.641 3.80e-06 ***
PriceRangeMiddle:NeighborhoodMitchel 0.211249 0.030170 7.002 3.95e-12 ***
PriceRangeLow:NeighborhoodMitchel 0.179612 0.030395 5.909 4.33e-09 ***
PriceRangeHigh:NeighborhoodNAmes 0.133258 0.037750 3.530 0.000429 ***
PriceRangeMiddle:NeighborhoodNAmes 0.152418 0.015605 9.767 < 2e-16 ***
PriceRangeLow:NeighborhoodNAmes 0.219770 0.031587 6.958 5.35e-12 ***
PriceRangeHigh:NeighborhoodNoRidge 0.331654 0.027797 11.931 < 2e-16 ***
PriceRangeMiddle:NeighborhoodNoRidge 0.000000 0.000000 NA NA
PriceRangeLow:NeighborhoodNoRidge 0.000000 0.000000 NA NA
PriceRangeHigh:NeighborhoodNPkVill 0.000000 0.000000 NA NA
PriceRangeMiddle:NeighborhoodNPkVill 0.209417 0.045940 4.559 5.61e-06 ***
PriceRangeLow:NeighborhoodNPkVill 0.000000 0.000000 NA NA
PriceRangeHigh:NeighborhoodNridgHt 0.397193 0.023636 16.805 < 2e-16 ***
PriceRangeMiddle:NeighborhoodNridgHt 0.348314 0.089432 3.895 0.000103 ***
PriceRangeLow:NeighborhoodNridgHt 0.000000 0.000000 NA NA
PriceRangeHigh:NeighborhoodNWAmes 0.195002 0.075131 2.595 0.009547 **
PriceRangeMiddle:NeighborhoodNWAmes 0.203194 0.019970 10.175 < 2e-16 ***
PriceRangeLow:NeighborhoodNWAmes 0.000000 0.000000 NA NA
PriceRangeHigh:NeighborhoodOldTown 0.261913 0.202631 1.293 0.196380
PriceRangeMiddle:NeighborhoodOldTown 0.070476 0.037437 1.883 0.059975 .
PriceRangeLow:NeighborhoodOldTown 0.018788 0.019332 0.972 0.331288
PriceRangeHigh:NeighborhoodSawyer 0.000000 0.000000 NA NA
PriceRangeMiddle:NeighborhoodSawyer 0.149423 0.020412 7.321 4.20e-13 ***
PriceRangeLow:NeighborhoodSawyer 0.124801 0.040541 3.078 0.002122 **
PriceRangeHigh:NeighborhoodSawyerW 0.290633 0.028635 10.150 < 2e-16 ***
PriceRangeMiddle:NeighborhoodSawyerW 0.229089 0.026704 8.579 < 2e-16 ***
PriceRangeLow:NeighborhoodSawyerW 0.114862 0.058799 1.953 0.050966 .
PriceRangeHigh:NeighborhoodSomerst 0.349811 0.026017 13.446 < 2e-16 ***
PriceRangeMiddle:NeighborhoodSomerst 0.379811 0.023794 15.962 < 2e-16 ***
PriceRangeLow:NeighborhoodSomerst 0.000000 0.000000 NA NA
PriceRangeHigh:NeighborhoodStoneBr 0.462130 0.029674 15.574 < 2e-16 ***
PriceRangeMiddle:NeighborhoodStoneBr 0.000000 0.000000 NA NA
PriceRangeLow:NeighborhoodStoneBr 0.000000 0.000000 NA NA
PriceRangeHigh:NeighborhoodSWISU 0.000000 0.000000 NA NA
PriceRangeMiddle:NeighborhoodSWISU 0.178727 0.036242 4.931 9.16e-07 ***
PriceRangeLow:NeighborhoodSWISU 0.089124 0.042612 2.092 0.036667 *
PriceRangeHigh:NeighborhoodTimber 0.339333 0.028881 11.750 < 2e-16 ***
PriceRangeMiddle:NeighborhoodTimber 0.256708 0.039830 6.445 1.60e-10 ***
PriceRangeLow:NeighborhoodTimber 0.000000 0.000000 NA NA
PriceRangeHigh:NeighborhoodVeenker 0.430892 0.050392 8.551 < 2e-16 ***
PriceRangeMiddle:NeighborhoodVeenker 0.223013 0.063329 3.521 0.000443 ***
PriceRangeLow:NeighborhoodVeenker 0.000000 0.000000 NA NA
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Approximate significance of smooth terms:
edf Ref.df F p-value
s(TotalSF):PriceRangeHigh 1.241 1.446 294.701 < 2e-16 ***
s(TotalSF):PriceRangeMiddle 2.140 2.761 93.705 < 2e-16 ***
s(TotalSF):PriceRangeLow 4.702 5.592 42.010 < 2e-16 ***
s(QualCond):PriceRangeHigh 2.674 3.371 19.324 2.40e-13 ***
s(QualCond):PriceRangeMiddle 4.388 5.219 12.204 7.99e-12 ***
s(QualCond):PriceRangeLow 2.622 3.300 51.032 < 2e-16 ***
s(Age) 5.904 7.059 10.842 1.94e-13 ***
s(MSSubClass) 7.261 8.134 8.372 2.45e-11 ***
s(GarageArea):GarageCars 2.000 2.000 62.900 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Rank: 136/161
R-sq.(adj) = 0.9 Deviance explained = 90.6%
GCV = 0.016601 Scale est. = 0.01561 n = 1456
plot(ames.gam.base)
ord.lm = lm(logPrice ~ OrdMaster:PriceRange, data=ames.test)
plot(ord.lm)
influencePlot(ord.lm)
ames.test = ames %>% mutate(OrdMaster = QualCond*ExterCond*ExterQual)
ord.gam = mgcv::gam(logPrice ~
s(OrdMaster)
+ s(TotalSF, by=PriceRange)
+ Neighborhood,
data=ames.test)
plot(ord.gam)
summary(ord.gam)
Family: gaussian
Link function: identity
Formula:
logPrice ~ s(OrdMaster) + s(TotalSF, by = PriceRange) + Neighborhood
Parametric coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 11.558492 0.007796 1482.549 < 2e-16 ***
NeighborhoodBlmngtn 0.488561 0.035110 13.915 < 2e-16 ***
NeighborhoodBlueste 0.366787 0.098519 3.723 0.000205 ***
NeighborhoodBrDale 0.333030 0.038944 8.551 < 2e-16 ***
NeighborhoodBrkSide 0.352229 0.020852 16.892 < 2e-16 ***
NeighborhoodClearCr 0.550246 0.027602 19.935 < 2e-16 ***
NeighborhoodCollgCr 0.535031 0.013575 39.414 < 2e-16 ***
NeighborhoodCrawfor 0.508258 0.020932 24.282 < 2e-16 ***
NeighborhoodEdwards 0.366981 0.016198 22.657 < 2e-16 ***
NeighborhoodGilbert 0.592288 0.017595 33.663 < 2e-16 ***
NeighborhoodIDOTRR 0.236668 0.027586 8.579 < 2e-16 ***
NeighborhoodMeadowV 0.324627 0.037962 8.551 < 2e-16 ***
NeighborhoodMitchel 0.455044 0.021403 21.261 < 2e-16 ***
NeighborhoodNAmes 0.403216 0.012168 33.139 < 2e-16 ***
NeighborhoodNoRidge 0.595490 0.025708 23.164 < 2e-16 ***
NeighborhoodNPkVill 0.412112 0.047299 8.713 < 2e-16 ***
NeighborhoodNridgHt 0.650982 0.019188 33.926 < 2e-16 ***
NeighborhoodNWAmes 0.445037 0.018211 24.437 < 2e-16 ***
NeighborhoodOldTown 0.259588 0.016026 16.198 < 2e-16 ***
NeighborhoodSawyer 0.398217 0.018231 21.843 < 2e-16 ***
NeighborhoodSawyerW 0.510025 0.019298 26.429 < 2e-16 ***
NeighborhoodSomerst 0.616710 0.017029 36.215 < 2e-16 ***
NeighborhoodStoneBr 0.668427 0.029585 22.594 < 2e-16 ***
NeighborhoodSWISU 0.318565 0.029128 10.937 < 2e-16 ***
NeighborhoodTimber 0.584961 0.023983 24.391 < 2e-16 ***
NeighborhoodVeenker 0.585412 0.043028 13.605 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Approximate significance of smooth terms:
edf Ref.df F p-value
s(OrdMaster) 6.716 7.725 56.53 <2e-16 ***
s(TotalSF):PriceRangeHigh 3.902 4.899 188.03 <2e-16 ***
s(TotalSF):PriceRangeMiddle 2.503 3.180 180.99 <2e-16 ***
s(TotalSF):PriceRangeLow 8.788 8.978 38.75 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Rank: 61/62
R-sq.(adj) = 0.867 Deviance explained = 87.1%
GCV = 0.021533 Scale est. = 0.02084 n = 1456
#plot(ames.lm)
#library(car)
inf = influencePlot(ames.lm)
ames[rownames(inf),] %>% select(c(Age,Neighborhood,QualCond,PriceRange,TotalSF))
rownames(inf)
[1] "218" "564" "575" "804" "900" "963"
ames.filt = ames[as.numeric(rownames(inf)),]
ames %>% anti_join(., ames.filt, by = c("SecFlrSF","Neighborhood","FirstFlrSF"))
## Base GAM Model
ames.gam.base <- mgcv::gam(logPrice
~ s(TotalSF, by=PriceRange)
#+ s(TotalBsmtSF)
+ s(QualCond, by=PriceRange)
+ s(Age)
+ Fireplaces:PriceRange
+ s(MSSubClass)
#+ PriceRange
+ Neighborhood:PriceRange
+ s(GarageArea, by=GarageCars),
#+ FullBath,
method='GCV.Cp',
data=ames)
## Base LM Model
ames.lm <- lm(logPrice
~ TotalSF:PriceRange
#+ TotalBsmtSF
+ QualCond:PriceRange
+ Age
+ Fireplaces:PriceRange
+ MSSubClass
#+ PriceRange
+ Neighborhood:PriceRange
+ GarageArea:GarageCars,
#+ FullBath,
#method='GCV.CP',
data=ames)
true = ames_test[,'SalePrice'][[1]]
# gam.predictions = predict.gam(ames.gam, newdata = ames_test, type = 'response')
# gam.errors = gam.predictions - log(true)
# gam.diff =exp(gam.predictions) - true
gam.base.predictions = predict.gam(ames.gam.base, newdata = ames_test, type = 'response')
gam.base.errors = gam.base.predictions - log(true)
gam.base.diff =exp(gam.base.predictions) - true
lm.predictions = predict(ames.lm, newdata = ames_test)
lm.errors = lm.predictions - log(true)
lm.diff = exp(lm.predictions) - true
print(paste('GAM RMSE:',sqrt(mean(gam.base.errors^2))))
print(paste('LM RMSE:',sqrt(mean(lm.errors^2))))
print(paste('GAM: Error in $ as Predicted:',round(mean(abs(gam.base.diff)),2)))
print(paste('LM: Error in $ as Predicted:',round(mean(abs(lm.diff)),2)))
summary(ames.gam.base)
ames$PriceRange <- factor(ames$PriceRange, levels = c("Low", "Middle", "High"))
ames.gam.test = mgcv::gam(logPrice ~
s(OverallQual, by=PriceRange, bs='gp')
+ s(TotalSF, by=PriceRange, bs='gp'),
method='ML',
data=ames)
test.preds= test %>% mutate(logPrice = predict.gam(ames.gam.test, newdata = test, type = 'response'))
gam_qual_eda = visreg(ames.gam.test, 'OverallQual', partial=TRUE,
#scale='response',
alpha=0.05, gg=TRUE,
line=list(col="red"),
fill=list(fill="pink"),
points=list(size=1, pch=1, alpha=0.4, col='black')) +
theme_bw() +
labs(y = 'Log Sale Price', x = 'Overall Quality') +
geom_smooth(method='lm', color='blue') +
scale_color_manual(values=colors)
ggsave(paste0("./presentation/gam_qual_eda.png"), gam_qual_eda)
Saving 7 x 7 in image
gam_qual_edaP = visreg(ames.gam.test, 'OverallQual', by='PriceRange', partial=TRUE,
#scale='response',
alpha=0.05, gg=TRUE,
line=list(col="red"),
fill=list(fill="pink"),
points=list(size=1, pch=1, alpha=0.4, col='black')) +
ylim(10,15) +
theme_bw() +
labs(y = 'Log Sale Price', x = 'Overall Quality') +
scale_color_manual(values=colors)
ggsave(paste0("./presentation/gam_qual_edaP.png"), gam_qual_edaP)
visreg(ames.gam.test, 'TotalSF', partial=TRUE,
#scale='response',
alpha=0.05, gg=TRUE,
line=list(col="red"),
fill=list(fill="pink"),
points=list(size=1, pch=1, alpha=0.4, col='black')) +
theme_bw() +
labs(y = 'Log Sale Price', x = 'Total Home SF') +
geom_smooth(method='lm', color='blue') +
scale_color_manual(values=colors)
visreg(ames.gam.test, 'TotalSF', by='PriceRange', partial=TRUE,
#scale='response',
alpha=0.05, gg=TRUE,
line=list(col="red"),
fill=list(fill="pink"),
points=list(size=1, pch=1, alpha=0.4, col='black')) +
ylim(10,15) +
theme_bw() +
labs(y = 'Log Sale Price', x = 'Total Home SF') +
scale_color_manual(values=colors)
#visreg(ames.gam, 'MSSubClass',by='PriceRange', overlay=TRUE)
# fit <- lm(log(SalePrice) ~ poly(GrLivArea, 2)*poly(OverallQual, 2), data=ames)
# visreg2d(fit, "GrLivArea", "OverallQual")
#
# visreg(ames.gam, 'TotalBsmtSF',by='HasGarage', overlay=TRUE)
grliv = visreg(ames.gam.base, 'TotalSF', partial=TRUE,
#scale='response',
alpha=0.05, gg=TRUE,
line=list(col="red"),
fill=list(fill="pink"),
points=list(size=1, pch=1, alpha=0.4, col='black')) +
theme_bw() +
labs(y = 'Log Sale Price', x = 'Total Home SF') +
geom_smooth(method='lm', color='blue') +
scale_color_manual(values=colors)
ggsave(paste0("./presentation/gam_grliv.png"), grliv)
grliv_price = visreg(ames.gam.base, 'TotalSF', by='PriceRange', partial=TRUE,
#scale='response',
alpha=0.05, gg=TRUE,
line=list(col="red"),
fill=list(fill="pink"),
points=list(size=1, pch=1, alpha=0.4, col='black')) +
theme_bw() +
labs(y = 'Log Sale Price', x = 'Total Home SF') +
scale_color_manual(values=colors)
ggsave(paste0("./presentation/gam_grliv_price.png"), grliv_price)
# bsmt = visreg(ames.gam.base, 'TotalBsmtSF', partial=TRUE,
# #scale='response',
# alpha=0.05, gg=TRUE,
# line=list(col="red"),
# fill=list(fill="pink"),
# points=list(size=1, pch=1, alpha=0.2, col='black')) +
# theme_bw() +
# labs(y = 'Log Sale Price', x = 'Total Basement SF') +
# geom_smooth(method='lm', color='blue') +
# scale_color_manual(values=colors)
ggsave(paste0("./presentation/gam_bsmt.png"), bsmt)
msub = visreg(ames.gam.base, 'MSSubClass', partial=TRUE,
jitter=TRUE,
alpha=0.05, gg=TRUE,
line=list(col="red"),
fill=list(fill="pink"),
points=list(size=1, pch=1, alpha=0.2, col='black')) +
theme_bw() +
labs(y = 'Log Sale Price', x = 'MSSubClass: Type of Dwelling') +
geom_smooth(method='lm', color='blue') +
scale_color_manual(values=colors)
ggsave(paste0("./presentation/gam_msub.png"), msub)
# cond = visreg(ames.gam.base, 'OverallCond', partial=TRUE,
# #scale='response',
# alpha=0.05, gg=TRUE,
# jitter=TRUE,
# line=list(col="red"),
# fill=list(fill="pink"),
# points=list(size=1, pch=1, alpha=0.2, col='black')) +
# theme_bw() +
# labs(y = 'Log Sale Price', x = 'Overall Condition of Home') +
# geom_smooth(method='lm', color='blue') +
# scale_color_manual(values=colors)
#
#
# ggsave(paste0("./presentation/gam_cond.png"), cond)
garage = visreg(ames.gam.base, 'GarageArea', by='GarageCars', partial=TRUE,
#scale='response',
alpha=0.05, gg=TRUE,
line=list(col="red"),
jitter=TRUE,
fill=list(fill="pink"),
points=list(size=1, pch=1, alpha=0.2, col='black')) +
geom_smooth(method='lm', color='blue') +
theme_bw() +
labs(y = 'Log Sale Price', x = 'Garage Area')
ggsave(paste0("./presentation/gam_garage.png"), garage)
age = visreg(ames.gam.base, 'Age', partial=TRUE,
#scale='response',
alpha=0.05, gg=TRUE,
line=list(col="red"),
fill=list(fill="pink"),
points=list(size=1, pch=1, alpha=0.2, col='black')) +
theme_bw() +
labs(y = 'Log Sale Price', x = 'Age of Home') +
geom_smooth(method='lm', color='blue') +
scale_color_manual(values=colors)
# geom_smooth(method='lm', color='blue') +
# scale_color_manual(values=colors)
ggsave(paste0("./presentation/gam_age.png"), age)
## Base GAM Model
ames.gam.test <- mgcv::gam(logPrice
~ s(QualCond, by=PriceRange)
+ s(TotalSF, by=PriceRange),
method='GCV.Cp',
data=ames)
## Base LM Model
ames.lm.test <- lm(logPrice
~ QualCond:PriceRange
+ TotalSF:PriceRange,
data=ames)
true = ames_test[,'SalePrice'][[1]]
summary(ames.gam.test)
#summary(ames.lm.test)
visreg(ames.gam.base, "QualCond", partial=TRUE,
#scale='response',
breaks = c(1000,2000,3000),
alpha=0.05, gg=TRUE)
visreg(ames.gam.base, "QualCond", by="PriceRange",
scale='response',
alpha=0.05, gg=TRUE) + ylim(10,15)
visreg(ames.gam.base, "PriceRange", by='TotalSF', partial=TRUE,
scale='response',
overlay=TRUE, breaks = c(1000,2000,3000),
alpha=0.05, gg=TRUE)
visreg(ames.lm, "PriceRange", by='TotalSF', partial=TRUE,
scale='response',
breaks = c(1500,2500,3500),
alpha=0.01, gg=TRUE)
visreg(ames.lm, "TotalSF", by='PriceRange', partial=TRUE,
scale='response',
#breaks = c(1500,2500,3500),
alpha=0.01, gg=TRUE)
visreg(ames.gam.base, 'TotalSF',partial=TRUE,
#scale='response',
alpha=0.05, gg=TRUE,
line=list(col="red"),
fill=list(fill="pink"),
points=list(size=1, pch=1, alpha=0.4, col='black')) +
theme_bw() +
geom_smooth(method = 'lm')
labs(y = 'Log Sale Price', x = 'Total Home SF') +
scale_color_manual(values=colors)
visreg(ames.gam.base, 'TotalSF', by='PriceRange', partial=TRUE,
#scale='response',
alpha=0.05, gg=TRUE,
line=list(col="red"),
fill=list(fill="pink"),
points=list(size=1, pch=1, alpha=0.4, col='black')) +
theme_bw() +
labs(y = 'Log Sale Price', x = 'Total Home SF') +
ylim(10,15)+
scale_color_manual(values=colors)
visreg(ames.lm, "PriceRange", by="TotalSF", partial=TRUE,
#scale='response',
overlay=TRUE, breaks = c(1000,2000,3000),
alpha=0.05, gg=TRUE)
visreg(ames.gam.base, "PriceRange", by="TotalSF",, partial=TRUE,
#scale='response',
overlay=TRUE, breaks = c(1000,2000,3000),
alpha=0.05, gg=TRUE)
std = sd(gam.base.errors)
datgam <- data.frame(density = c(gam.base.errors), model = rep(c("GAM")))
datlm <- data.frame(density = c(lm.errors), model = rep(c("LM")))
# dens_comp = ggplot(dat,aes(x = density, fill= model)) +
# # stat_function(fun = dnorm, args=list(0,std), aes(col='Gaussian')) +
# geom_density(alpha=0.5) +
# theme_bw() +
# #scale_fill_brewer(palette="Dark2") +
# labs(x='Residual Magnitude', y='Frequency')
datgam %>%
ggplot(aes(x=density)) +
geom_density()
ggsave(paste0("./presentation/dens_comp.png"), dens_comp)
Saving 7.29 x 4.51 in image
plot(true, exp(gam.base.predictions), ylim=c(0,7e5), xlim=c(0,7e5))
abline(a=0,b=1)
#points(ames_test_[,'SalePrice'][[1]], exp(lm.predictions))
plot(true, exp(lm.predictions), ylim=c(0,7e5), xlim=c(0,7e5))
abline(a=0,b=1)
# gam.dollars = data.frame(cbind(exp(gam.base.predictions)-true, true))
# colnames(gam.dollars) = c('Error','Real')
#
# ggplot(gam.dollars, aes(x=Real, y=Error)) +
# geom_point() +
# theme_bw() +
# ylim(-1e5,1e5) +
# xlim(0,7e5) +
# labs(x = 'True Price [$]',y = 'Prediction Error [$]')
#
# slope = mean(abs(gam.dollars$Error))/mean(gam.dollars$Real)
# #slope = mean(abs(gam.dollars$Error)/gam.dollars$Real)
# ggplot(gam.dollars, aes(x=Real, y=abs(Error))) +
# geom_point() +
# geom_abline(intercept = 0, slope = 3*slope, linetype='dashed') +
# geom_abline(intercept = 0, slope = 2*slope, linetype='dashed') +
# geom_abline(intercept = 0, slope = 1*slope, linetype='dashed') +
# geom_abline(intercept = 0, slope = 0*slope, linetype='dashed') +
# theme_bw() +
# #ylim(-1e5,1e5) +
# xlim(0,7e5) +
# labs(x = 'True Price [$]',y = 'GAM Absolute Prediction Error [$]')
#
# #ggsave(paste0("./presentation/res_gam.png"), res_gam)
gam.test_predictions = predict.gam(ames.gam.base, newdata = ames_test, type = 'response')
lm.test_predictions = predict(ames.lm, newdata = ames_test, type = 'response')
prediction from a rank-deficient fit may be misleading
gam.dollars = data.frame(cbind(gam.test_predictions, log(true), gam.test_predictions-log(true)))
colnames(gam.dollars) = c('Fitted','Real','Error')
gam.dollars$stdError = gam.dollars$Error/sd(gam.dollars$Error)
lm.dollars = data.frame(cbind(lm.test_predictions, log(true), lm.test_predictions-log(true)))
colnames(lm.dollars) = c('Fitted','Real','Error')
lm.dollars$stdError = lm.dollars$Error/sd(lm.dollars$Error)
#slope = mean(abs(gam.dollars$Error))/mean(gam.dollars$Real)
#gam_res =
ggplot(gam.dollars, aes(x=Fitted, y=stdError)) +
geom_point(alpha=0.2) +
geom_smooth() +
theme_bw() +
#ylim(-1e5,1e5) +
#xlim(1,log(7e5)) +
labs(x = 'GAM Fitted Values',y = 'Residual Error')
#ggsave(paste0("./presentation/gam_res.png"), gam_res)
#slope = mean(abs(gam.dollars$Error))/mean(gam.dollars$Real)
#lm_res =
ggplot(lm.dollars, aes(x=Fitted, y=stdError)) +
geom_point(alpha=0.2) +
geom_smooth() +
theme_bw() +
#ylim(-1e5,1e5) +
#xlim(1,log(7e5)) +
labs(x = 'LM Fitted Values',y = 'Residual Error')
#ggsave(paste0("./presentation/lm_res.png"), lm_res)
plot(ames.lm)
not plotting observations with leverage one:
446, 788, 914, 1023
not plotting observations with leverage one:
446, 788, 914, 1023
gam.anal= data.frame(cbind(ames.gam.base$fitted.values, ames.gam.base$residuals))
colnames(gam.anal) = c('Fitted','Error')
gam.anal$stdError = ames.gam.base$residuals/sd(ames.gam.base$residuals)
gam.anal %>%
ggplot(aes(x=Fitted, y=stdError)) +
geom_point(alpha=0.2) +
geom_smooth() +
theme_bw() +
#ylim(-1e5,1e5) +
#xlim(1,log(7e5)) +
labs(x = 'GAM Fitted Values',y = 'Residual Error')
gam.anal %>%
ggplot(aes(x=Fitted, y=stdError^2)) +
geom_point(alpha=0.2) +
geom_smooth() +
theme_bw() +
#ylim(-1e5,1e5) +
#xlim(1,log(7e5)) +
labs(x = 'GAM Fitted Values',y = 'Residual Error')
gam.anal %>%
ggplot(aes(x=stdError)) +
geom_density()
theme_bw()
List of 93
$ line :List of 6
..$ colour : chr "black"
..$ size : num 0.5
..$ linetype : num 1
..$ lineend : chr "butt"
..$ arrow : logi FALSE
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_line" "element"
$ rect :List of 5
..$ fill : chr "white"
..$ colour : chr "black"
..$ size : num 0.5
..$ linetype : num 1
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_rect" "element"
$ text :List of 11
..$ family : chr ""
..$ face : chr "plain"
..$ colour : chr "black"
..$ size : num 11
..$ hjust : num 0.5
..$ vjust : num 0.5
..$ angle : num 0
..$ lineheight : num 0.9
..$ margin : 'margin' num [1:4] 0pt 0pt 0pt 0pt
.. ..- attr(*, "valid.unit")= int 8
.. ..- attr(*, "unit")= chr "pt"
..$ debug : logi FALSE
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_text" "element"
$ title : NULL
$ aspect.ratio : NULL
$ axis.title : NULL
$ axis.title.x :List of 11
..$ family : NULL
..$ face : NULL
..$ colour : NULL
..$ size : NULL
..$ hjust : NULL
..$ vjust : num 1
..$ angle : NULL
..$ lineheight : NULL
..$ margin : 'margin' num [1:4] 2.75pt 0pt 0pt 0pt
.. ..- attr(*, "valid.unit")= int 8
.. ..- attr(*, "unit")= chr "pt"
..$ debug : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_text" "element"
$ axis.title.x.top :List of 11
..$ family : NULL
..$ face : NULL
..$ colour : NULL
..$ size : NULL
..$ hjust : NULL
..$ vjust : num 0
..$ angle : NULL
..$ lineheight : NULL
..$ margin : 'margin' num [1:4] 0pt 0pt 2.75pt 0pt
.. ..- attr(*, "valid.unit")= int 8
.. ..- attr(*, "unit")= chr "pt"
..$ debug : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_text" "element"
$ axis.title.x.bottom : NULL
$ axis.title.y :List of 11
..$ family : NULL
..$ face : NULL
..$ colour : NULL
..$ size : NULL
..$ hjust : NULL
..$ vjust : num 1
..$ angle : num 90
..$ lineheight : NULL
..$ margin : 'margin' num [1:4] 0pt 2.75pt 0pt 0pt
.. ..- attr(*, "valid.unit")= int 8
.. ..- attr(*, "unit")= chr "pt"
..$ debug : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_text" "element"
$ axis.title.y.left : NULL
$ axis.title.y.right :List of 11
..$ family : NULL
..$ face : NULL
..$ colour : NULL
..$ size : NULL
..$ hjust : NULL
..$ vjust : num 0
..$ angle : num -90
..$ lineheight : NULL
..$ margin : 'margin' num [1:4] 0pt 0pt 0pt 2.75pt
.. ..- attr(*, "valid.unit")= int 8
.. ..- attr(*, "unit")= chr "pt"
..$ debug : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_text" "element"
$ axis.text :List of 11
..$ family : NULL
..$ face : NULL
..$ colour : chr "grey30"
..$ size : 'rel' num 0.8
..$ hjust : NULL
..$ vjust : NULL
..$ angle : NULL
..$ lineheight : NULL
..$ margin : NULL
..$ debug : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_text" "element"
$ axis.text.x :List of 11
..$ family : NULL
..$ face : NULL
..$ colour : NULL
..$ size : NULL
..$ hjust : NULL
..$ vjust : num 1
..$ angle : NULL
..$ lineheight : NULL
..$ margin : 'margin' num [1:4] 2.2pt 0pt 0pt 0pt
.. ..- attr(*, "valid.unit")= int 8
.. ..- attr(*, "unit")= chr "pt"
..$ debug : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_text" "element"
$ axis.text.x.top :List of 11
..$ family : NULL
..$ face : NULL
..$ colour : NULL
..$ size : NULL
..$ hjust : NULL
..$ vjust : num 0
..$ angle : NULL
..$ lineheight : NULL
..$ margin : 'margin' num [1:4] 0pt 0pt 2.2pt 0pt
.. ..- attr(*, "valid.unit")= int 8
.. ..- attr(*, "unit")= chr "pt"
..$ debug : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_text" "element"
$ axis.text.x.bottom : NULL
$ axis.text.y :List of 11
..$ family : NULL
..$ face : NULL
..$ colour : NULL
..$ size : NULL
..$ hjust : num 1
..$ vjust : NULL
..$ angle : NULL
..$ lineheight : NULL
..$ margin : 'margin' num [1:4] 0pt 2.2pt 0pt 0pt
.. ..- attr(*, "valid.unit")= int 8
.. ..- attr(*, "unit")= chr "pt"
..$ debug : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_text" "element"
$ axis.text.y.left : NULL
$ axis.text.y.right :List of 11
..$ family : NULL
..$ face : NULL
..$ colour : NULL
..$ size : NULL
..$ hjust : num 0
..$ vjust : NULL
..$ angle : NULL
..$ lineheight : NULL
..$ margin : 'margin' num [1:4] 0pt 0pt 0pt 2.2pt
.. ..- attr(*, "valid.unit")= int 8
.. ..- attr(*, "unit")= chr "pt"
..$ debug : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_text" "element"
$ axis.ticks :List of 6
..$ colour : chr "grey20"
..$ size : NULL
..$ linetype : NULL
..$ lineend : NULL
..$ arrow : logi FALSE
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_line" "element"
$ axis.ticks.x : NULL
$ axis.ticks.x.top : NULL
$ axis.ticks.x.bottom : NULL
$ axis.ticks.y : NULL
$ axis.ticks.y.left : NULL
$ axis.ticks.y.right : NULL
$ axis.ticks.length : 'unit' num 2.75pt
..- attr(*, "valid.unit")= int 8
..- attr(*, "unit")= chr "pt"
$ axis.ticks.length.x : NULL
$ axis.ticks.length.x.top : NULL
$ axis.ticks.length.x.bottom: NULL
$ axis.ticks.length.y : NULL
$ axis.ticks.length.y.left : NULL
$ axis.ticks.length.y.right : NULL
$ axis.line : list()
..- attr(*, "class")= chr [1:2] "element_blank" "element"
$ axis.line.x : NULL
$ axis.line.x.top : NULL
$ axis.line.x.bottom : NULL
$ axis.line.y : NULL
$ axis.line.y.left : NULL
$ axis.line.y.right : NULL
$ legend.background :List of 5
..$ fill : NULL
..$ colour : logi NA
..$ size : NULL
..$ linetype : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_rect" "element"
$ legend.margin : 'margin' num [1:4] 5.5pt 5.5pt 5.5pt 5.5pt
..- attr(*, "valid.unit")= int 8
..- attr(*, "unit")= chr "pt"
$ legend.spacing : 'unit' num 11pt
..- attr(*, "valid.unit")= int 8
..- attr(*, "unit")= chr "pt"
$ legend.spacing.x : NULL
$ legend.spacing.y : NULL
$ legend.key :List of 5
..$ fill : chr "white"
..$ colour : logi NA
..$ size : NULL
..$ linetype : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_rect" "element"
$ legend.key.size : 'unit' num 1.2lines
..- attr(*, "valid.unit")= int 3
..- attr(*, "unit")= chr "lines"
$ legend.key.height : NULL
$ legend.key.width : NULL
$ legend.text :List of 11
..$ family : NULL
..$ face : NULL
..$ colour : NULL
..$ size : 'rel' num 0.8
..$ hjust : NULL
..$ vjust : NULL
..$ angle : NULL
..$ lineheight : NULL
..$ margin : NULL
..$ debug : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_text" "element"
$ legend.text.align : NULL
$ legend.title :List of 11
..$ family : NULL
..$ face : NULL
..$ colour : NULL
..$ size : NULL
..$ hjust : num 0
..$ vjust : NULL
..$ angle : NULL
..$ lineheight : NULL
..$ margin : NULL
..$ debug : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_text" "element"
$ legend.title.align : NULL
$ legend.position : chr "right"
$ legend.direction : NULL
$ legend.justification : chr "center"
$ legend.box : NULL
$ legend.box.just : NULL
$ legend.box.margin : 'margin' num [1:4] 0cm 0cm 0cm 0cm
..- attr(*, "valid.unit")= int 1
..- attr(*, "unit")= chr "cm"
$ legend.box.background : list()
..- attr(*, "class")= chr [1:2] "element_blank" "element"
$ legend.box.spacing : 'unit' num 11pt
..- attr(*, "valid.unit")= int 8
..- attr(*, "unit")= chr "pt"
$ panel.background :List of 5
..$ fill : chr "white"
..$ colour : logi NA
..$ size : NULL
..$ linetype : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_rect" "element"
$ panel.border :List of 5
..$ fill : logi NA
..$ colour : chr "grey20"
..$ size : NULL
..$ linetype : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_rect" "element"
$ panel.spacing : 'unit' num 5.5pt
..- attr(*, "valid.unit")= int 8
..- attr(*, "unit")= chr "pt"
$ panel.spacing.x : NULL
$ panel.spacing.y : NULL
$ panel.grid :List of 6
..$ colour : chr "grey92"
..$ size : NULL
..$ linetype : NULL
..$ lineend : NULL
..$ arrow : logi FALSE
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_line" "element"
$ panel.grid.major : NULL
$ panel.grid.minor :List of 6
..$ colour : NULL
..$ size : 'rel' num 0.5
..$ linetype : NULL
..$ lineend : NULL
..$ arrow : logi FALSE
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_line" "element"
$ panel.grid.major.x : NULL
$ panel.grid.major.y : NULL
$ panel.grid.minor.x : NULL
$ panel.grid.minor.y : NULL
$ panel.ontop : logi FALSE
$ plot.background :List of 5
..$ fill : NULL
..$ colour : chr "white"
..$ size : NULL
..$ linetype : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_rect" "element"
$ plot.title :List of 11
..$ family : NULL
..$ face : NULL
..$ colour : NULL
..$ size : 'rel' num 1.2
..$ hjust : num 0
..$ vjust : num 1
..$ angle : NULL
..$ lineheight : NULL
..$ margin : 'margin' num [1:4] 0pt 0pt 5.5pt 0pt
.. ..- attr(*, "valid.unit")= int 8
.. ..- attr(*, "unit")= chr "pt"
..$ debug : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_text" "element"
$ plot.title.position : chr "panel"
$ plot.subtitle :List of 11
..$ family : NULL
..$ face : NULL
..$ colour : NULL
..$ size : NULL
..$ hjust : num 0
..$ vjust : num 1
..$ angle : NULL
..$ lineheight : NULL
..$ margin : 'margin' num [1:4] 0pt 0pt 5.5pt 0pt
.. ..- attr(*, "valid.unit")= int 8
.. ..- attr(*, "unit")= chr "pt"
..$ debug : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_text" "element"
$ plot.caption :List of 11
..$ family : NULL
..$ face : NULL
..$ colour : NULL
..$ size : 'rel' num 0.8
..$ hjust : num 1
..$ vjust : num 1
..$ angle : NULL
..$ lineheight : NULL
..$ margin : 'margin' num [1:4] 5.5pt 0pt 0pt 0pt
.. ..- attr(*, "valid.unit")= int 8
.. ..- attr(*, "unit")= chr "pt"
..$ debug : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_text" "element"
$ plot.caption.position : chr "panel"
$ plot.tag :List of 11
..$ family : NULL
..$ face : NULL
..$ colour : NULL
..$ size : 'rel' num 1.2
..$ hjust : num 0.5
..$ vjust : num 0.5
..$ angle : NULL
..$ lineheight : NULL
..$ margin : NULL
..$ debug : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_text" "element"
$ plot.tag.position : chr "topleft"
$ plot.margin : 'margin' num [1:4] 5.5pt 5.5pt 5.5pt 5.5pt
..- attr(*, "valid.unit")= int 8
..- attr(*, "unit")= chr "pt"
$ strip.background :List of 5
..$ fill : chr "grey85"
..$ colour : chr "grey20"
..$ size : NULL
..$ linetype : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_rect" "element"
$ strip.background.x : NULL
$ strip.background.y : NULL
$ strip.placement : chr "inside"
$ strip.text :List of 11
..$ family : NULL
..$ face : NULL
..$ colour : chr "grey10"
..$ size : 'rel' num 0.8
..$ hjust : NULL
..$ vjust : NULL
..$ angle : NULL
..$ lineheight : NULL
..$ margin : 'margin' num [1:4] 4.4pt 4.4pt 4.4pt 4.4pt
.. ..- attr(*, "valid.unit")= int 8
.. ..- attr(*, "unit")= chr "pt"
..$ debug : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_text" "element"
$ strip.text.x : NULL
$ strip.text.y :List of 11
..$ family : NULL
..$ face : NULL
..$ colour : NULL
..$ size : NULL
..$ hjust : NULL
..$ vjust : NULL
..$ angle : num -90
..$ lineheight : NULL
..$ margin : NULL
..$ debug : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_text" "element"
$ strip.switch.pad.grid : 'unit' num 2.75pt
..- attr(*, "valid.unit")= int 8
..- attr(*, "unit")= chr "pt"
$ strip.switch.pad.wrap : 'unit' num 2.75pt
..- attr(*, "valid.unit")= int 8
..- attr(*, "unit")= chr "pt"
$ strip.text.y.left :List of 11
..$ family : NULL
..$ face : NULL
..$ colour : NULL
..$ size : NULL
..$ hjust : NULL
..$ vjust : NULL
..$ angle : num 90
..$ lineheight : NULL
..$ margin : NULL
..$ debug : NULL
..$ inherit.blank: logi TRUE
..- attr(*, "class")= chr [1:2] "element_text" "element"
- attr(*, "class")= chr [1:2] "theme" "gg"
- attr(*, "complete")= logi TRUE
- attr(*, "validate")= logi TRUE
gam.anal %>%
ggplot(aes(x = sample(1:nrow(gam.anal)), y = stdError)) +
geom_point(alpha=0.5) +
geom_hline(yintercept=qt(0.025, df = nrow(gam.anal) - 2), color = "red", linetype="dashed", color = "red") +
geom_hline(yintercept=qt(1 - 0.025, df = nrow(gam.anal) - 2), color = "red", linetype="dashed", color = "red") +
labs(x = 'Train Set Index', y = 'Standardized Residuals') +
theme_bw()
Duplicated aesthetics after name standardisation: colourDuplicated aesthetics after name standardisation: colour
lm.anal= data.frame(cbind(ames.lm$fitted.values, ames.lm$residuals))
colnames(lm.anal) = c('Fitted','Error')
lm.anal$stdError = ames.lm$residuals/sd(ames.lm$residuals)
lm.anal %>%
ggplot(aes(x=Fitted, y=stdError)) +
geom_point(alpha=0.2) +
geom_smooth() +
theme_bw() +
labs(x = 'LM Fitted Values',y = 'Residual Error')
lm.anal %>%
ggplot(aes(x = sample(1:nrow(lm.anal)), y = stdError)) +
geom_point(alpha=0.5) +
geom_hline(yintercept=qt(0.025, df = nrow(lm.anal) - 2), color = "red", linetype="dashed", color = "red") +
geom_hline(yintercept=qt(1 - 0.025, df = nrow(lm.anal) - 2), color = "red", linetype="dashed", color = "red") +
labs(x = 'Train Set Index', y = 'Standardized Residuals') +
theme_bw()
Duplicated aesthetics after name standardisation: colourDuplicated aesthetics after name standardisation: colour
lm.dollars = data.frame(cbind(exp(lm.predictions)-true, true))
colnames(lm.dollars) = c('Error','Real')
ggplot(lm.dollars, aes(x=Real, y=Error)) +
geom_point() +
theme_bw() +
ylim(-1e5,1e5) +
xlim(0,7e5) +
labs(x = 'True Price [$]',y = 'Prediction Error [$]')
slope = mean(abs(lm.dollars$Error))/mean(lm.dollars$Real)
#slope = mean(abs(gam.dollars$Error)/gam.dollars$Real)
res_lm = ggplot(lm.dollars, aes(x=Real, y=abs(Error))) +
geom_point() +
geom_abline(intercept = 0, slope = 3*slope, linetype='dashed') +
geom_abline(intercept = 0, slope = 2*slope, linetype='dashed') +
geom_abline(intercept = 0, slope = 1*slope, linetype='dashed') +
geom_abline(intercept = 0, slope = 0*slope, linetype='dashed') +
theme_bw() +
#ylim(-1e5,1e5) +
xlim(0,7e5) +
labs(x = 'True Price [$]',y = 'LM Absolute Prediction Error [$]')
ggsave(paste0("./presentation/res_lm.png"), res_lm)
lm.dollars = data.frame(cbind(lm.predictions-log(true), log(true)))
colnames(lm.dollars) = c('Error','Real')
ggplot(lm.dollars, aes(x=Real, y=Error)) +
geom_point() +
theme_bw() +
#ylim(-1e5,1e5) +
#xlim(0,7e5) +
labs(x = 'Log True Price',y = 'LM Prediction Error')
slope = mean(abs(lm.dollars$Error))/mean(lm.dollars$Real)
#slope = mean(abs(gam.dollars$Error)/gam.dollars$Real)
log_res_lm = ggplot(lm.dollars, aes(x=Real, y=abs(Error))) +
geom_point() +
geom_abline(intercept = 0, slope = 3*slope, linetype='dashed') +
geom_abline(intercept = 0, slope = 2*slope, linetype='dashed') +
geom_abline(intercept = 0, slope = 1*slope, linetype='dashed') +
geom_abline(intercept = 0, slope = 0*slope, linetype='dashed') +
theme_bw() +
#ylim(-1e5,1e5) +
#xlim(1,log(7e5)) +
labs(x = 'Log True Price',y = 'LM Absolute Prediction Error')
ggsave(paste0("./presentation/log_res_lm.png"), log_res_lm)
ames.gam <- mgcv::gam(log(SalePrice)
~ s(GrLivArea, by=PriceRange, bs='cs', id=1)
+ s(YearBuilt)
+ ti(OverallQual,OverallCond)
+ s(MSSubClass)
+ s(YearBuilt)
+ CentralAir
,method='GCV.Cp', data=ames_train, gamma=1.4, select=TRUE)
visreg(ames.gam)
true = ames_test[,'SalePrice'][[1]]
gam.predictions = predict.gam(ames.gam, newdata = ames_test, type = 'response')
gam.errors = gam.predictions - log(true)
gam.diff =exp(gam.predictions) - true
plot(density(gam.errors), col ='red')
lines(density(gam.base.errors), col='black', lty=2)
#plot(ames.gam)
legend("topright",legend=c('GAM Residuals','Base GAM Residuals'),
col=c("red","black"), lty=1:2, cex=0.8)
print(paste('GAM RMSE:',sqrt(mean(gam.errors^2))))
print(paste('GAM: Error in $ as Predicted:',round(mean(abs(gam.diff)),2)))
#plot(ames_test[,'SalePrice'][[1]], gam.errors)
plot(true, exp(gam.predictions), ylim=c(0,7e5), xlim=c(0,7e5))
abline(a=0,b=1)
summary(ames.gam)
gam.test_predictions = predict.gam(ames.gam.base, newdata = test, type = 'response')
submission = data.frame(exp(gam.test_predictions))
colnames(submission) = 'SalePrice'
submission = tibble::rowid_to_column(submission,'Id')
rownames(submission) = 1461:(nrow(submission)+1460)
gmb_submission = mutate(submission, Id = Id + 1460)
lm.test_predictions = predict(ames.lm, newdata = test, type = 'response')
lm_submission = data.frame(exp(lm.test_predictions))
colnames(lm_submission) = 'SalePrice'
lm_submission = tibble::rowid_to_column(lm_submission,'Id')
rownames(lm_submission) = 1461:(nrow(lm_submission)+1460)
lm_submission = mutate(lm_submission, Id = Id + 1460)
#write.csv(gam.test_predictions,'./data/price_predictions.csv')
# write.table(gm_submission,file="./data/gm_predictions.csv",col.names = c("Id","SalePrice"),sep = ",",row.names = F)
write.table(lm_submission,file="./data/lm_predictions.csv",col.names = c("Id","SalePrice"),sep = ",",row.names = F)
write.table(gmb_submission,file="./data/gmb_predictions.csv",col.names = c("Id","SalePrice"),sep = ",",row.names = F)
write.table((lm_submission+gmb_submission)/2,file="./data/avg_predictions.csv",col.names = c("Id","SalePrice"),sep = ",",row.names = F)
dim_pred = read_csv('../Dmitri/predictions_1590435442.csv')
Parsed with column specification:
cols(
Id = [32mcol_double()[39m,
SalePrice = [32mcol_double()[39m
)
test.preds = test %>% mutate(preds = predict.gam(ames.gam.base, newdata = test, type = 'response'), logpreds = log(preds))
test.preds.lm = test %>% mutate(preds = predict(ames.lm, newdata = test, type = 'response'), logpreds = log(preds))
prediction from a rank-deficient fit may be misleading
test.preds.dim = test %>% mutate(preds = dim_pred$SalePrice, logpreds = log(preds))
test.preds$PriceRange <- factor(test.preds$PriceRange, levels = c("High", "Middle", "Low"))
preds_unstruct =
test.preds %>%
ggplot(aes(x=1:nrow(test.preds), y = preds, col=PriceRange)) +
geom_point() +
labs(x = 'Test Set Index', y = 'Price Predictions') +
theme_bw()
ggsave(paste0("./presentation/preds_gam/unstruct.png"), preds_unstruct)
Saving 7 x 7 in image
preds_struct =
test.preds %>%
arrange(preds) %>%
ggplot(aes(x=1:nrow(test.preds), y = preds, col=PriceRange)) +
geom_point() +
labs(x = 'Test Set Index', y = 'Price Predictions') +
theme_bw()
ggsave(paste0("./presentation/preds_gam/struct.png"), preds_struct)
preds_struct_log =
test.preds %>%
arrange(logpreds) %>%
ggplot(aes(x=1:nrow(test.preds), y = logpreds, col=PriceRange)) +
geom_point() +
labs(x = 'Test Set Index', y = 'Log Price Predictions') +
theme_bw()
ggsave(paste0("./presentation/preds_gam/struct_log.png"), preds_struct_log)
preds_binned =
test.preds %>%
arrange(logpreds) %>%
ggplot(aes(x=1:nrow(test.preds), y = logpreds, col=PriceRange)) +
geom_point() +
facet_wrap('PriceRange') +
labs(x = 'Test Set Index', y = 'Log Price Predictions') +
theme_bw()
ggsave(paste0("./presentation/preds_gam/binned.png"), preds_binned)
quants =
ggplot(test.preds, aes(sample = logpreds, colour = PriceRange)) +
stat_qq() +
stat_qq_line()
ggsave(paste0("./presentation/preds_gam/quants.png"), quants)
test.preds.lm$PriceRange <- factor(test.preds.lm$PriceRange, levels = c("High", "Middle", "Low"))
preds_unstruct =
test.preds.lm %>%
ggplot(aes(x=1:nrow(test.preds), y = preds, col=PriceRange)) +
geom_point() +
labs(x = 'Test Set Index', y = 'Price Predictions') +
theme_bw()
ggsave(paste0("./presentation/preds_lm/unstruct.png"), preds_unstruct)
Saving 7 x 7 in image
preds_struct =
test.preds.lm %>%
arrange(preds) %>%
ggplot(aes(x=1:nrow(test.preds), y = preds, col=PriceRange)) +
geom_point() +
labs(x = 'Test Set Index', y = 'Price Predictions') +
theme_bw()
ggsave(paste0("./presentation/preds_lm/truct.png"), preds_struct)
preds_struct_log =
test.preds.lm %>%
arrange(logpreds) %>%
ggplot(aes(x=1:nrow(test.preds), y = logpreds, col=PriceRange)) +
geom_point() +
labs(x = 'Test Set Index', y = 'Log Price Predictions') +
theme_bw()
ggsave(paste0("./presentation/preds_lm/struct_log.png"), preds_struct_log)
preds_binned =
test.preds.lm %>%
arrange(logpreds) %>%
ggplot(aes(x=1:nrow(test.preds), y = logpreds, col=PriceRange)) +
geom_point() +
facet_wrap('PriceRange') +
labs(x = 'Test Set Index', y = 'Log Price Predictions') +
theme_bw()
ggsave(paste0("./presentation/preds_lm/binned.png"), preds_binned)
quants =
ggplot(test.preds.lm, aes(sample = logpreds, colour = PriceRange)) +
stat_qq() +
stat_qq_line()
ggsave(paste0("./presentation/preds_lm/quants.png"), quants)
test.preds.dim$PriceRange <- factor(test.preds.dim$PriceRange, levels = c("High", "Middle", "Low"))
preds_unstruct =
test.preds.dim %>%
ggplot(aes(x=1:nrow(test.preds), y = preds, col=PriceRange)) +
geom_point() +
labs(x = 'Test Set Index', y = 'Price Predictions') +
theme_bw()
ggsave(paste0("./presentation/preds_dim/unstruct.png"), preds_unstruct)
Saving 7 x 7 in image
preds_struct =
test.preds.dim %>%
arrange(preds) %>%
ggplot(aes(x=1:nrow(test.preds), y = preds, col=PriceRange)) +
geom_point() +
labs(x = 'Test Set Index', y = 'Price Predictions') +
theme_bw()
ggsave(paste0("./presentation/preds_dim/truct.png"), preds_struct)
preds_struct_log =
test.preds.dim %>%
arrange(log(preds)) %>%
ggplot(aes(x=1:nrow(test.preds), y = logpreds, col=PriceRange)) +
geom_point() +
labs(x = 'Test Set Index', y = 'Log Price Predictions') +
theme_bw()
ggsave(paste0("./presentation/preds_dim/struct_log.png"), preds_struct_log)
preds_binned =
test.preds.dim %>%
arrange(log(preds)) %>%
ggplot(aes(x=1:nrow(test.preds), y = logpreds, col=PriceRange)) +
geom_point() +
facet_wrap('PriceRange') +
labs(x = 'Test Set Index', y = 'Log Price Predictions') +
theme_bw()
ggsave(paste0("./presentation/preds_dim/binned.png"), preds_binned)
quants =
ggplot(test.preds.dim, aes(sample = logpreds, colour = PriceRange)) +
stat_qq() +
labs(y = 'Log Price') +
stat_qq_line()
ggsave(paste0("./presentation/preds_dim/quants.png"), quants)
stat_box_data <- function(y, upper_limit = max(ames$SalePrice) * 1.15) {
return(
data.frame(
y = 0.98 * upper_limit,
label = paste('c\n', length(y), '\n')
)
)
}
g = ames %>%
ggplot(aes(x = reorder(Neighborhood,SalePrice, mean), y = SalePrice)) +
geom_boxplot() +
stat_summary(
fun.data = stat_box_data,
geom = "text",
hjust = 0.5,
vjust = 0.9
) +
theme_bw() +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
labs(x='Neighborhood', y = 'SalePrice')
ggsave(paste0("./Linear Model/hood_price.png"), g)
min_n = 20
# ames_train %>%
# group_by(Neighborhood, OverallCond) %>%
# tally() %>%
# filter(n > min_n) %>%
# ggplot(aes(x=Neighborhood, y=OverallCond)) +
# geom_point() +
# geom_text(color='red',size=4,aes(y=OverallCond+0.2, label=n)) +
# theme_bw() +
# theme(axis.text.x=element_text(angle=45, hjust=1))
#
# ames_train %>%
# group_by(MSSubClass, OverallQual) %>%
# tally() %>%
# filter(n > min_n) %>%
# ggplot(aes(x=factor(MSSubClass), y=OverallQual)) +
# geom_point() +
# geom_text(color='red',size=4,aes(y=OverallQual+0.2, label=n)) +
# theme_bw() +
# theme(axis.text.x=element_text(angle=45, hjust=1))
#
#
# ames_train %>%
# group_by(HouseStyle, OverallQual) %>%
# tally() %>%
# filter(n > min_n) %>%
# ggplot(aes(x=factor(HouseStyle), y=OverallQual)) +
# geom_point() +
# geom_text(color='red',size=4,aes(y=OverallQual+0.2, label=n)) +
# theme_bw() +
# theme(axis.text.x=element_text(angle=45, hjust=1))
# Zoning relationship wrt Overall Quality of the Home
# Floating Village Residential has score >= 6
# Commercial zones have scores <= 6
# Low density residential spans all scores
# medium density tends towards higher scores
# ames_train %>%
# group_by(MSZoning, OverallQual) %>%
# tally() %>%
# filter(n > min_n) %>%
# ggplot(aes(x=factor(MSZoning), y=OverallQual)) +
# geom_point() +
# geom_text(color='red',size=4,aes(y=OverallQual+0.2, label=n)) +
# theme_bw() +
# theme(axis.text.x=element_text(angle=45, hjust=1))
#
# ames_train %>%
# group_by(SaleType, OverallQual) %>%
# tally() %>%
# filter(n > min_n) %>%
# ggplot(aes(x=factor(SaleType), y=OverallQual)) +
# geom_point() +
# geom_text(color='red',size=4,aes(y=OverallQual+0.2, label=n)) +
#
# theme_bw() +
# theme(axis.text.x=element_text(angle=45, hjust=1))
#
# ames %>%
# group_by(SaleType, OverallCond) %>%
# tally() %>%
# filter(n > min_n) %>%
# ggplot(aes(x=factor(SaleType), y=OverallCond)) +
# geom_point() +
# geom_text(color='red',size=4,aes(y=OverallCond+0.2, label=n)) +
# theme_bw() +
# theme(axis.text.x=element_text(angle=45, hjust=1))
#
# ames %>%
# group_by(GarageType, GarageCond) %>%
# tally() %>%
# filter(n > min_n) %>%
# ggplot(aes(x=factor(GarageType), y=GarageCond)) +
# geom_point() +
# geom_text(color='red',size=4,aes(y=GarageCond+0.2, label=n)) +
# theme_bw() +
# theme(axis.text.x=element_text(angle=45, hjust=1))
nQ_test = test %>%
group_by(Neighborhood, OverallQual) %>%
tally() %>%
filter(n > min_n) %>%
ggplot(aes(x=reorder(Neighborhood,OverallQual,max), y=OverallQual)) +
geom_point() +
geom_text(color='red',size=4,aes(y=OverallQual+0.2, label=n)) +
theme_bw() +
labs(x = 'Neighborhood (Test Set)') +
theme(axis.text.x=element_text(angle=45, hjust=1))
ggsave(paste0("./Linear Model/nQ_test.png"), nQ_test)
nC_test = test %>%
group_by(Neighborhood, OverallCond) %>%
tally() %>%
filter(n > min_n) %>%
ggplot(aes(x=reorder(Neighborhood,OverallCond,max), y=OverallCond)) +
geom_point() +
geom_text(color='red',size=4,aes(y=OverallCond+0.2, label=n)) +
theme_bw() +
labs(x = 'Neighborhood (Test Set)') +
theme(axis.text.x=element_text(angle=45, hjust=1))
ggsave(paste0("./Linear Model/nC_test.png"), nC_test)
nQ = ames %>%
group_by(Neighborhood, OverallQual) %>%
tally() %>%
filter(n > min_n) %>%
ggplot(aes(x=reorder(Neighborhood,OverallQual,max), y=OverallQual)) +
geom_point() +
geom_text(color='red',size=4,aes(y=OverallQual+0.2, label=n)) +
theme_bw() +
labs(x = 'Neighborhood (Training Set)') +
theme(axis.text.x=element_text(angle=45, hjust=1))
ggsave(paste0("./Linear Model/nQ.png"), nQ)
nC = ames %>%
group_by(Neighborhood, OverallCond) %>%
tally() %>%
filter(n > min_n) %>%
ggplot(aes(x=reorder(Neighborhood,OverallCond,max), y=OverallCond)) +
geom_point() +
geom_text(color='red',size=4,aes(y=OverallCond+0.2, label=n)) +
theme_bw() +
labs(x = 'Neighborhood (Training Set)') +
theme(axis.text.x=element_text(angle=45, hjust=1))
ggsave(paste0("./Linear Model/nC.png"), nC)
ames %>%
group_by(OverallCond, OverallQual) %>%
tally() %>%
filter(n > min_n) %>%
ggplot(aes(x=OverallCond, y=OverallQual)) +
geom_point() +
geom_text(color='red',size=4,aes(y=OverallQual+0.2, label=n)) +
theme_bw() +
theme(axis.text.x=element_text(angle=45, hjust=1))
ames %>%
group_by(OverallQual, PriceRange) %>%
tally() %>%
filter(n > min_n) %>%
ggplot(aes(y=OverallQual, x = PriceRange)) +
geom_point() +
geom_text(color='red',size=4,aes(y=OverallQual+0.5, label=n)) +
theme_bw() +
theme(axis.text.x=element_text(angle=45, hjust=1))
ames %>%
group_by(OverallCond, PriceRange) %>%
tally() %>%
filter(n > min_n) %>%
ggplot(aes(y=OverallCond, x = PriceRange)) +
geom_point() +
geom_text(color='red',size=4,aes(y=OverallCond+0.5, label=n)) +
theme_bw() +
theme(axis.text.x=element_text(angle=45, hjust=1))
#ames$PriceRange =ames$SalePrice %>% cut_number(5, labels=c('Cheap','Lower Middle', 'Middle','Upper Middle','Expensive'))
#ames %>% mutate(PriceRange = ifelse(Neighborhood %in% medians$Neighborhood, factor(medians$PriceRange),0)) %>% select(c(Neighborhood,PriceRange))
#ames = ames %>% left_join(., medians[,c('Neighborhood','PriceRange')], by='Neighborhood')
ames %>%
group_by(Neighborhood, PriceRange, SalePrice) %>%
summarise(med = median(SalePrice), n = n()) %>%
filter(n > min_n) %>%
ggplot(aes(x=reorder(Neighborhood,n,max), y = PriceRange)) +
geom_point(color='white') +
geom_text(color='red',size=4,aes(y=PriceRange, label=n)) +
theme_bw() +
theme(axis.text.x=element_text(angle=45, hjust=1))
ames %>%
ggplot(aes(x=PriceRange, y = SalePrice)) +
geom_boxplot() +
theme_bw() +
theme(axis.text.x=element_text(angle=45, hjust=1))
ames %>%
group_by(Neighborhood, ExterCond) %>%
tally() %>%
filter(n > 20) %>%
ggplot(aes(x=Neighborhood, y=ExterCond)) +
geom_point() +
geom_text(color='red',size=4,aes(y=ExterCond+0.2, label=n)) +
theme_bw() +
theme(axis.text.x=element_text(angle=45, hjust=1))
test %>%
group_by(ExterQual, ExterCond) %>%
tally() %>%
ggplot(aes(x=ExterQual, y=ExterCond)) +
geom_point() +
geom_text(color='red',size=4,aes(y=ExterCond+0.2, label=n)) +
theme_bw() +
theme(axis.text.x=element_text(angle=45, hjust=1))
test %>%
group_by(Neighborhood, ExterQual) %>%
tally() %>%
filter(n > 20) %>%
ggplot(aes(x=reorder(Neighborhood,ExterQual,max), y=ExterQual)) +
geom_point() +
geom_text(color='red',size=4,aes(y=ExterQual+0.2, label=n)) +
theme_bw() +
theme(axis.text.x=element_text(angle=45, hjust=1))
ames %>% transmute(PopNeigh = ifelse(Neighborhood %in% c('NAmes','CollgCr'),1,0))
stat_box_data <- function(y, upper_limit = max(ames$SalePrice) * 1.15) {
return(
data.frame(
y = 0.98 * upper_limit,
label = paste('count\n', length(y), '\n')
)
)
}
ames %>%
ggplot(aes(x=reorder(SaleType,SalePrice,median), y = SalePrice)) +
geom_boxplot() +
stat_summary(
fun.data = stat_box_data,
geom = "text",
hjust = 0.5,
vjust = 0.9
) +
theme_bw() +
labs(x = 'Sale Type', y = 'Sale Price')
theme(axis.text.x=element_text(angle=45, hjust=1))
ames %>%
ggplot(aes(y = SalePrice, x = OverallQual, col = SaleType)) +
geom_point() +
theme_bw()
ames %>%
ggplot(aes(y = SalePrice, x = OverallCond, col = SaleType)) +
geom_point() +
theme_bw()
ames %>%
ggplot(aes(x = LotArea, y = OverallQual, col = PriceRange)) +
geom_point() +
theme_bw()
ames %>%
ggplot(aes(x = GrLivArea, y = OverallQual, col = PriceRange)) +
geom_point() +
theme_bw()
ames %>%
ggplot(aes(x = SecFlrSF, y = OverallQual, col = PriceRange)) +
geom_point() +
theme_bw()
model.nointeraction = lm(SalePrice ~ ExterCond + sqrt(GarageArea), data=ames_train_)
model.interaction = lm(SalePrice ~ ExterCond:sqrt(GarageArea), data=ames_train_)
anova(model.nointeraction, model.interaction)
test_interaction = function(feature1, feature2) {
model.nointeraction = lm(SalePrice ~ GrLivArea + feature1 + feature2, data=ames_train)
model.interaction = lm(SalePrice ~ GrLivArea + feature1:feature2, data=ames_train)
#print(summary(model.interaction))
print(anova(model.nointeraction, model.interaction))
}
#test_interaction(ames_train$OverallCond, ames_train$Neighborhood)
#test_interaction(ames_train$OverallQual, ames_train$Neighborhood)
a = test_interaction(ames_train$Neighborhood, ames_train$OverallCond)
test_interaction(ames_train$ExterCond, sqrt(ames_train$GarageArea))
library(ggfortify)
require(gridExtra)
plot_numerical_bands = function(df, x_name, y_name) {
x = df[,x_name][[1]]
y = df[,y_name][[1]]
model = lm(y ~ x, data=df)
predicted = predict(model)
g = ggplot(df, aes(x,y)) +
geom_smooth(method = "lm", se = FALSE, color = "lightgrey") +
geom_segment(aes(xend = x, yend=predicted), alpha=0.2) +
geom_point(aes(color=abs(model$residuals))) +
scale_color_continuous(low = "black", high = "red") +
guides(color=FALSE) +
geom_point(aes(y=predicted),shape=1) +
labs(x = x_name, y = y_name) +
theme_bw()
# Construction of Confidence Intervals
clower = predict(model, interval='confidence', level=0.95)[,2]
cupper = predict(model, interval='confidence', level=0.95)[,3]
plower = predict(model, interval='prediction', level=0.95)[,2]
pupper = predict(model, interval='prediction', level=0.95)[,3]
g2 = g + geom_ribbon(aes(ymin = clower, ymax = cupper), alpha=0.3, col = 'red') +
geom_ribbon(aes(ymin=plower, ymax = pupper), alpha=0.2, col='blue')
model_name = paste0(y_name,'~',x_name)
dir.create(paste0("./Linear Model/",model_name))
# Save Scatter Plot
ggsave(paste0("./Linear Model/",model_name,"/scatter_plot.pdf"), g2)
# Save Diagnostic PLots
pdf(paste0("./Linear Model/",model_name,"/residual_plots.pdf"))
par(mfrow=c(2,2))
plot(model,which=c(1,2,4,6))
dev.off()
# Save Influence Plot
pdf(paste0("./Linear Model/",model_name,"/influence.pdf"))
inf = influencePlot(model)
dev.off()
# plot(x,
# abs(model$residuals),
# xlab = x_name,
# ylab = 'Residual Values Squared',
# main = 'Squared Residuals compared to MSE')
# abline(h=sqrt(mean(model$residuals^2)), lty = 2, lwd=3, col = 'red')
# ss = smooth.spline(x, model$residuals^2, cv = TRUE)
# lines(ss, lwd=2, col='blue')
# abline(h=mean(ss$y), lwd=2, col='black')
# legend("topleft",
# c("MSE", "Spline", "Spline Mean"),
# lty = c(2, 1, 1),
# col = c("red", "blue", "black"))
}
require(gridExtra)
plot_cat_bands = function(df, x_name, y_name) {
x = df[,x_name][[1]]
y = df[,y_name][[1]]
model = lm(y ~ x, data=df)
predicted = predict(model)
g = ggplot(df, aes(x,y)) +
geom_smooth(method = "lm", se = FALSE, color = "lightgrey") +
geom_segment(aes(xend = x, yend=predicted), alpha=0.2) +
geom_point(aes(color=abs(model$residuals))) +
scale_color_continuous(low = "black", high = "red") +
guides(color=FALSE) +
geom_point(aes(y=predicted),shape=1) +
labs(x = x_name, y = y_name) +
theme_bw()
# Construction of Confidence Intervals
clower = predict(model, interval='confidence', level=0.95)[,2]
cupper = predict(model, interval='confidence', level=0.95)[,3]
plower = predict(model, interval='prediction', level=0.95)[,2]
pupper = predict(model, interval='prediction', level=0.95)[,3]
g2 = g + geom_ribbon(aes(ymin = clower, ymax = cupper), alpha=0.3, col = 'red') +
geom_ribbon(aes(ymin=plower, ymax = pupper), alpha=0.2, col='blue')
model_name = paste0(y_name,'~',x_name)
dir.create(paste0("./Linear Model/",model_name))
# Save Scatter Plot
ggsave(paste0("./Linear Model/",model_name,"/scatter_plot.pdf"), g2)
# Save Diagnostic PLots
pdf(paste0("./Linear Model/",model_name,"/residual_plots.pdf"))
par(mfrow=c(2,2))
plot(model,which=c(1,2,4,6))
dev.off()
# Save Influence Plot
pdf(paste0("./Linear Model/",model_name,"/influence.pdf"))
inf = influencePlot(model)
dev.off()
# plot(x,
# abs(model$residuals),
# xlab = x_name,
# ylab = 'Residual Values Squared',
# main = 'Squared Residuals compared to MSE')
# abline(h=sqrt(mean(model$residuals^2)), lty = 2, lwd=3, col = 'red')
# ss = smooth.spline(x, model$residuals^2, cv = TRUE)
# lines(ss, lwd=2, col='blue')
# abline(h=mean(ss$y), lwd=2, col='black')
# legend("topleft",
# c("MSE", "Spline", "Spline Mean"),
# lty = c(2, 1, 1),
# col = c("red", "blue", "black"))
}
sf_bins = rbin_manual(ames_train, SalePrice, TotalSF, c(1e2,1e3,1e4,1e5,1e6))
plot(sf_bins)
ames_train %>%
mutate(BedroomAbvGr = as.factor(BedroomAbvGr)) %>%
group_by(BedroomAbvGr) %>%
ggplot(aes(x = GrLivArea, y = SalePrice)) + geom_point() +
geom_smooth(method = "lm", se = TRUE, color = "red") +
facet_wrap(BedroomAbvGr~.)
feature = 'Alley'
ames_train %>%
mutate(feature = as.factor(feature)) %>%
group_by(feature) %>%
ggplot(aes(x = TotalSF, y = SalePrice)) + geom_point() +
geom_smooth(method = "lm", se = TRUE, color = "red") +
#scale_y_continuous(trans='log') +
facet_wrap(feature)
feature = 'Alley'
ames_train %>%
#mutate(feature = as.factor(feature)) %>%
group_by(Alley) %>%
summarise(SalePrice = mean(SalePrice)) %>%
ggplot(aes(x = Alley, y = SalePrice)) + geom_col()
#geom_smooth(method = "lm", se = TRUE, color = "red") +
#scale_y_continuous(trans='log') +
#facet_wrap(feature)
library(scales)
plot_pred_by_nom = function(nom_feature, predictor, scale_log = FALSE) {
# x = ames_train[,predictor][[1]]
# #x2 = ames_train[,nom_feature][[1]]
# model = lm(ames_train$SalePrice ~ x)
# predicted = predict(model)
g = ames_train %>%
mutate(nom_feature = as.factor(nom_feature)) %>%
group_by(nom_feature) %>%
ggplot(aes_string(x = predictor, y = 'SalePrice'), environment=environment()) +
geom_point(shape=1,alpha=0.4) +
geom_smooth(method = "lm", se = TRUE, color = "red", linetype='dashed') +
# geom_segment(aes(xend = x, yend=predicted), alpha=0.2) +
# geom_point(aes(color=abs(model$residuals))) +
# scale_color_continuous(low = "black", high = "red") +
# guides(color=FALSE) +
# geom_point(aes(y=predicted),shape=1, alpha=0.2) +
facet_wrap(nom_feature) +
labs(title = paste0(predictor,' v Sale Price for given ',nom_feature)) +
theme_bw()
model_name = paste0('SalePrice~',predictor)
if(scale_log){
g = g +
scale_y_continuous(trans='log') +
labs(title=paste0(predictor,' v Log Sale Price for given ',nom_feature))
model_name = paste0('Log_SalePrice~',predictor)
}
file_name = paste0("./Linear Model/Ordinals/",nom_feature,'/',model_name,'.pdf')
dir.create(paste0("./Linear Model/Ordinals/",nom_feature))
# Save Scatter Plot
ggsave(file_name, g)
#dev.off()
print(g)
}
plot_pred_by_nom('MSSubClass',"TotalSF", scale_log=F)
feature = 'HasPool'
ames_train %>%
mutate(feature = as.factor(feature)) %>%
#group_by(feature) %>%
ggplot(aes(x = TotalSF, y = SalePrice)) + geom_point() +
geom_smooth(method = "lm", se = TRUE, color = "red") +
facet_grid(feature) +
scale_y_continuous(trans='log')
feature = 'HasGarage'
ames_train %>%
mutate(feature = as.factor(feature)) %>%
group_by(feature) %>%
ggplot(aes(x = TotalSF, y = SalePrice)) + geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
facet_wrap(feature) +
scale_y_continuous(trans='log')
boxplot(log(ames_train$SalePrice) ~ ames_train$MoSold, outline = FALSE)
boxplot(ames_train$SalePrice ~ ames_train$YrSold, outline = FALSE)
boxplot(log(ames_train$SalePrice) ~ ames_train$YearBuilt, outline = FALSE)
feature = 'BsmtExposure'
ames_train %>%
mutate(feature = as.factor(feature)) %>%
group_by(feature) %>%
ggplot(aes(x = TotalSF, y = SalePrice)) + geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
scale_y_continuous(trans='log') +
facet_wrap(feature)
Lets look at Total square footage across different defined sub classes
feature = 'MSSubClass'
g = ames_train %>%
#filter(feature < 70.0) %>%
mutate(feature = as.factor(feature)) %>%
group_by(feature) %>%
ggplot(aes(x = TotalSF, y = SalePrice)) + geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
scale_y_continuous(trans='log') +
facet_wrap(feature)
ggsave("./Linear Model/MsSubClass_logplot.pdf", g)
feature = 'MSSubClass'
ames_train %>%
filter(AfterWW2==1) %>%
mutate(feature = as.factor(feature)) %>%
group_by(feature) %>%
ggplot(aes(x = GrLivArea, y = SalePrice)) + geom_point() +
geom_smooth(method = "lm", se = TRUE, color = "red") +
#geom_spline(color = "blue") +
#scale_y_continuous(trans='log') +
facet_wrap(feature)
ggplot(aes(x, abs(model$residuals)),
xlab = x_name,
ylab = 'Residual Values Squared',
main = 'Squared Residuals compared to MSE')
abline(h=sqrt(mean(model$residuals^2)), lty = 2, lwd=3, col = 'red')
ss = smooth.spline(x, model$residuals^2, cv = TRUE)
lines(ss, lwd=2, col='blue')
abline(h=mean(ss$y), lwd=2, col='black')
model <- eval(bquote(lm(.(f), data = ames_train)))
library(broom)
# Steps 1 and 2
d <- lm(mpg ~ hp, data = mtcars) %>%
augment()
head(d)
# Steps 3 and 4
ggplot(d, aes(x = hp, y = mpg)) +
geom_smooth(method = "lm", se = FALSE, color = "lightgrey") +
geom_segment(aes(xend = hp, yend = .fitted), alpha = .2) + # Note `.fitted`
geom_point(aes(alpha = abs(.resid))) + # Note `.resid`
guides(alpha = FALSE) +
geom_point(aes(y = .fitted), shape = 1) + # Note `.fitted`
theme_bw()
# The new line of code
model = lm(log(SalePrice) ~ log(TotalSF), data=ames_train)
predicted = predict(model)
g = ames_train %>%
mutate(SalePrice = log(SalePrice), TotalSF = log(TotalSF)) %>%
ggplot(aes(x=TotalSF,y=SalePrice)) +
#geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "lightgrey") +
geom_segment(aes(xend = TotalSF, yend=predicted), alpha=0.2) +
geom_point(aes(color=abs(model$residuals))) +
scale_color_continuous(low = "black", high = "red") +
guides(color=FALSE) +
geom_point(aes(y=predicted),shape=1) +
theme_bw()
# Construction of Confidence Intervals
clower = predict(model, interval='confidence', level=0.95)[,2]
cupper = predict(model, interval='confidence', level=0.95)[,3]
plower = predict(model, interval='prediction', level=0.95)[,2]
pupper = predict(model, interval='prediction', level=0.95)[,3]
g + geom_ribbon(aes(ymin = clower, ymax = cupper), alpha=0.3, col = 'red') +
geom_ribbon(aes(ymin=plower, ymax = pupper), alpha=0.2, col='blue')
Definitely a logarithimic relationship…
qqnorm(model$residuals)
qqline(model$residuals)
ames_train %>%
ggplot(aes(x = TotalSF, y = SalePrice)) +
#geom_point() +
geom_hex(bins=55)
According to Akaike Criterion
library(MASS)
model.empty = mgcv::gam(log(SalePrice) ~ 1, data = ames) #The model with an intercept ONLY.
model.full = ames.gam #The model with ALL variables.
scope = list(lower = formula(model.empty), upper = formula(model.full))
forwardAIC = step(model.empty, scope, direction = "forward", k = 2)
backwardAIC = step(model.full, scope, direction = "backward", k = 2)
bothAIC.empty = step(model.empty, scope, direction = "both", k = 2)
bothAIC.full = step(model.full, scope, direction = "both", k = 2)
summary(bothAIC.empty)
library(mgcv)
set.seed(0);n <- 400
dat <- gamSim(1,n=n,scale=2)
attach(dat)
## Note the increased gamma parameter below to favour
## slightly smoother models...
b<-mgcv::gam(y~ s(x0,bs="ts")+
s(x1,bs="ts")+
s(x2,bs="ts")+
s(x3,bs="ts"),
gamma=1.4)
summary(b)
plot(b,pages=1)
## Same again using REML/ML
b<-gam(y~s(x0,bs="ts")+s(x1,bs="ts")+s(x2,bs="ts")+
s(x3,bs="ts")+s(x4,bs="ts")+s(x5,bs="ts"),method="REML")
summary(b)
plot(b,pages=1)
## And once more, but using the null space penalization
b<-gam(y~s(x0,bs="cr")+s(x1,bs="cr")+s(x2,bs="cr")+
s(x3,bs="cr")+s(x4,bs="cr")+s(x5,bs="cr"),
method="REML",select=TRUE)
summary(b)
plot(b,pages=1)
detach(dat);rm(dat)
influencePlot(bothAIC.full)
make_pred = function(model, data) {
data = data[,attr(model$terms, 'term.labels')]
pred.band = predict(model, data, interval='prediction')
}
sapply(num_train[rownames(inf),attr(bothAIC.full$terms, 'term.labels')],exp) %>% data.frame()
summary(bothAIC.empty)
features = c('YearBuilt','YearRemodAdd','BsmtFullBath','BsmtHalfBath','Full','SalePrice')
train_dis = train[,features]
#test_dis = test[,features]
clean_data_str = function(date_str) {
mdy = mdy(date_str)
ymd = ymd(date_str)
if(is.na(mdy)){
if(is.na(ymd)){
return(NA)
}
else {return(ymd)}
}
else {return(mdy)}
}
train_dis %>% ggplot(aes(x = YearBuilt, y = SalePrice)) + geom_point()
Split into test and train
library(tidyverse)
x = model.matrix(SalePrice ~ ., ames)[, -1]
y = ames$SalePrice
set.seed(0)
train = sample(1:nrow(x), 7*nrow(x)/10)
y.test = y[-train]
library(caret)
set.seed(0)
grid = 10^seq(5, -2, length = 100)
cv.lasso.out = cv.glmnet(x[train,],y[train], lambda=grid, alpha=1, nfolds=10)
plot(cv.lasso.out)
bestlambda.lasso = cv.lasso.out$lambda.min
bestlambda.lasso
log(bestlambda.lasso)
lasso.bestlambdatrain = predict(lasso.models.train, s = bestlambda.lasso, newx = x[-train,])
mean((lasso.bestlambdatrain - y.test)^2)
tmp_coeffs <- coef(cv.glmnet.fit, s = "lambda.min")
data.frame(name = tmp_coeffs@Dimnames[[1]][tmp_coeffs@i + 1], coefficient = tmp_coeffs@x)
coefs.lasso = coef(cv.lasso.out)
str(coefs.lasso)
coefs.lasso@x
gam1 = mgcv::gam(SalePrice ~ s(TotalSF, bs='ps', sp=0.6) + s(KitchenAbvGr, bs='ps', sp=0.6), data=ames_train)
### GAM example using mgcv
library(mgcv)
library(ggplot2)
# fake data
n <- 50
sig <- 2
dat <- gamSim(1,n=n,scale=sig)
# P-spline smoothers (with lambda=0.6) used for x1 and x2; x3 is parametric.
b1 <- mgcv::gam(y ~ s(x1, bs='ps', sp=0.6) + s(x2, bs='ps', sp=0.6) + x3, data = dat)
summary(b1)
plot(b1)
# plot the smooth predictor function for x1 with ggplot to get a nicer looking graph
p <- predict(b1, type="lpmatrix")
beta <- coef(b1)[grepl("x1", names(coef(b1)))]
s <- p[,grepl("x1", colnames(p))] %*% beta
ggplot(data=cbind.data.frame(s, dat$x1), aes(x=dat$x1, y=s)) + geom_line()
# predict
newdf <- gamSim(1,n=n,scale=sig)
f <- predict(b1, newdata=newdf)
# select smoothing parameters with REML, using P-splines
b2 <- mgcv::gam(y ~ s(x1, bs='ps') + s(x2, bs='ps') + x3, data = dat, method="REML")
# select variables and smoothing parameters
b3 <- mgcv::gam(y ~ s(x0) + s(x1) + s(x2) + s(x3) , data = dat, method="REML", select=TRUE)
# loess smoothers with the gam package (restart R before loading gam)
library(gam)
b4 <- gam::gam(y ~ lo(x1, span=0.6) + lo(x2, span=0.6) + x3, data = dat)
summary(b4)
feature = 'OverallQual'
df = ames_train %>% group_by(AfterWW2,HouseStyle) %>% summarise(price_med = median(SalePrice), n =n())
ames_train %>%
ggplot(aes(x = factor(HouseStyle), y = SalePrice)) + geom_boxplot() +
geom_text(data=df, color='red',size=4,aes(y=price_med*0.8, label=n)) +
facet_grid(.~factor(AfterWW2)) +
labs() +
#scale_y_continuous(trans='log') +
theme_bw()
model.fire = lm(log(SalePrice) ~ sqrt(GrLivArea) + sqrt(TotalBsmtSF) + OverallCond + AfterWW2 + Fireplaces + IsNew + OverallQual, data = ames)
summary(model.fire)
plot(model.fire)
inf = influencePlot(model.fire)
ames[rownames(inf),]
model = lm(log(SalePrice) ~ TotalBsmtSF + GrLivArea, data = ames)
# Create a vector of gradually-changing colors, with one entry for each data # point
the.colors <- rainbow(n = nrow(df))
# For each data point, see how it ranks according to X2, from smallest (1)
# to largest
the.ranks <- rank(ames$GrLivArea)
# Plot residuals vs. X1, colored according to X2 Defining the color and rank
# vectors makes this next line a bit less mysterious, but it's not
# necessary; this could all be a one-liner.
plot(ames$TotalBsmtSF, residuals(model), pch = 19, col = the.colors[the.ranks], ylab = "Residuals")
hist(sqrt(ames$TotalBsmtSF))
hist(ames$TotalBsmtSF)
hist(log(ames$SalePrice))
library(tidyverse)
model.test = lm(SalePrice ~ TotalSF + AfterWW2, data = ames)
res = data.frame(cbind(ames$AfterWW2, residuals(model.test)))
colnames(res) = c('AfterWW2','Residuals')
ggplot(data = res, aes(x = factor(AfterWW2), y = Residuals)) + geom_boxplot()
ggplot(data = res, aes(x=Residuals)) + geom_density(alpha=.2, fill="#FF6666") + facet_grid(res$AfterWW2, scales = 'free_y')
ames %>%
filter(YearBuilt > 1990) %>%
group_by(YearBuilt,CentralAir) %>%
tally() %>%
ggplot(aes(x = YearBuilt, y=n, fill=CentralAir)) + geom_col()
ames %>%
filter(YearBuilt > 1990) %>%
group_by(YearBuilt,MSSubClass) %>%
tally() %>%
ggplot(aes(x = YearBuilt, y=n, fill=factor(MSSubClass))) + geom_col()
hist(ames$SalePrice)
b = c(-Inf, 1.2e5, 2e5,5e5,Inf)
names = c('Cheap','Average','Expensive','Very Expensive')
bins = cut(ames$SalePrice, breaks = b, labels = names)
ames %>%
cbind(., bins) %>%
filter(YearBuilt > 1930) %>%
group_by(YearBuilt,bins) %>%
tally() %>%
ggplot(aes(x = YearBuilt, y=n, fill=factor(bins))) + geom_col()
ames %>%
group_by(YrSold,YearBuilt) %>%
tally() %>%
ggplot(aes(x = YrSold, y=n, fill=YearBuilt)) + geom_col(position='fill')
model.test = lm(log(SalePrice) ~ sqrt(TotalSF)*MSSubClass,data=ames )
summary(model.test)
library(AppliedPredictiveModeling)
transparentTheme(trans = .4)
library(caret)
plotSubset <- data.frame(scale(ames[, c("SalePrice", "MSSubClass")]))
xyplot(SalePrice ~ MSSubClass,
data = ames,
groups = ames$AfterWW2,
auto.key = list(columns = 2))
transformed <- spatialSign(plotSubset)
transformed <- as.data.frame(transformed)
xyplot(SalePrice ~ MSSubClass,
data = transformed,
groups = ames$AfterWW2,
auto.key = list(columns = 2))
ames_train %>%
group_by(MSSubClass,AfterWW2) %>%
ggplot(aes(x = factor(MSSubClass), y=SalePrice)) + geom_boxplot() +facet_grid(('AfterWW2'))
ames_train %>%
group_by(MSSubClass,HouseStyle) %>%
ggplot(aes(x = factor(MSSubClass), y=SalePrice)) + geom_boxplot() +facet_wrap(('HouseStyle'))
d.tree = rpart::rpart(formula = SalePrice ~ Neighborhood, data = ames)
rpart.plot::rpart.plot(d.tree)
library(tree.bins)
library(rpart)
sample.df <- ames[,c('Neighborhood','SalePrice','MSZoning')]
binned.df <- tree.bins(data = sample.df, y = SalePrice)#, bin.nm = "bin.", return = "new.fctrs")
unique(binned.df$Neighborhood)#current levels of Neighborhood
sample.df <- tree.bins::AmesImpFctrs[, c("Neighborhood", "MS.Zoning", "SalePrice")]
binned.df = tree.bins(data = sample.df, y = SalePrice, bin.nm = "bin.", return = "lkup.list")
unique(binned.df$Neighborhood)#current levels of Neighborhood
ames = ames %>% left_join(., binned.df[[1]][,c('Neighborhood','Categories')], by='Neighborhood')
ames = ames %>% left_join(., binned.df[[2]][,c('MS.Zoning','Categories')], by=c('MSZoning'='MS.Zoning'))
test[!complete.cases(test),]
sum(is.na(test))
#test.binn = test %>% select(c('Neighborhood',''))
test.binn = kknn(PriceRange ~
Neighborhood +
TotalBathrooms +
KitchenAbvGr +
OverallQual +
ExterQual +
MSZoning, complete, missing, k=1)
ames_noSale = ames %>% select(-SalePrice)
all = rbind(ames_noSale, test)
imputed_Data <- mice(test, m=5, maxit = 20, method = 'pmm', seed = 500);
completeData = complete(imputed_Data,1)
test.comp = tibble(completeData)
test.comp['preds'] = gam.base.predictions
test.comp %>%
ggplot(aes(x = 1:nrow(test.comp), y = preds, col = PriceRange)) + geom_point()